{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Local Multimodal pipeline with OpenVINO\n",
    "\n",
    "[OpenVINO™](https://github.com/openvinotoolkit/openvino) is an open-source toolkit for optimizing and deploying AI inference. The OpenVINO™ Runtime supports various hardware [devices](https://github.com/openvinotoolkit/openvino?tab=readme-ov-file#supported-hardware-matrix) including x86 and ARM CPUs, and Intel GPUs. It can help to boost deep learning performance in Computer Vision, Automatic Speech Recognition, Natural Language Processing and other common tasks.\n",
    "\n",
    "Hugging Face multimodal model can be supported by OpenVINO through ``OpenVINOMultiModal`` class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index-multi-modal-llms-openvino -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index llama-index-readers-file -q"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export and compress multimodal model\n",
    "\n",
    "It is possible to [export your model](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#export) to the OpenVINO IR format with the CLI, and load the model from local folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "model_id = \"llava-hf/llava-v1.6-mistral-7b-hf\"\n",
    "model_path = Path(model_id.split(\"/\")[-1]) / \"FP16\"\n",
    "\n",
    "if not model_path.exists():\n",
    "    !optimum-cli export openvino --model {model_id} --weight-format fp16 {model_path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:nncf:Statistics of the bitwidth distribution:\n",
      "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
      "│   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │\n",
      "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
      "│              8 │ 2% (1 / 225)                │ 0% (0 / 224)                           │\n",
      "├────────────────┼─────────────────────────────┼────────────────────────────────────────┤\n",
      "│              4 │ 98% (224 / 225)             │ 100% (224 / 224)                       │\n",
      "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7255112d94f2405da307504a1bdc2ad4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import shutil\n",
    "import nncf\n",
    "import openvino as ov\n",
    "import gc\n",
    "\n",
    "core = ov.Core()\n",
    "\n",
    "compression_config = {\n",
    "    \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n",
    "    \"group_size\": 64,\n",
    "    \"ratio\": 0.6,\n",
    "}\n",
    "\n",
    "compressed_model_path = model_path.parent / \"INT4\"\n",
    "if not compressed_model_path.exists():\n",
    "    ov_model = core.read_model(model_path / \"openvino_language_model.xml\")\n",
    "    compressed_ov_model = nncf.compress_weights(ov_model, **compression_config)\n",
    "    ov.save_model(\n",
    "        compressed_ov_model,\n",
    "        compressed_model_path / \"openvino_language_model.xml\",\n",
    "    )\n",
    "    del compressed_ov_model\n",
    "    del ov_model\n",
    "    gc.collect()\n",
    "    for file_name in model_path.glob(\"*\"):\n",
    "        if file_name.name in [\n",
    "            \"openvino_language_model.xml\",\n",
    "            \"openvino_language_model.bin\",\n",
    "        ]:\n",
    "            continue\n",
    "        shutil.copy(file_name, compressed_model_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare the input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.makedirs(\"./input_images\", exist_ok=True)\n",
    "\n",
    "url = \"https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg\"\n",
    "image = Image.open(requests.get(url, stream=True).raw)\n",
    "\n",
    "image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.multi_modal_llms.openvino import OpenVINOMultiModal\n",
    "from transformers import AutoProcessor\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\n",
    "    \"llava-v1.6-mistral-7b-hf/INT4\", trust_remote_code=True\n",
    ")\n",
    "\n",
    "\n",
    "def messages_to_prompt(messages, image_documents):\n",
    "    \"\"\"\n",
    "    Prepares the input messages and images.\n",
    "    \"\"\"\n",
    "    conversation = [{\"type\": \"text\", \"text\": messages[0].content}]\n",
    "    images = []\n",
    "    for img_doc in image_documents:\n",
    "        images.append(img_doc)\n",
    "        conversation.append({\"type\": \"image\"})\n",
    "    messages = [\n",
    "        {\"role\": \"user\", \"content\": conversation}\n",
    "    ]  # Wrap conversation in a user role\n",
    "\n",
    "    print(messages)\n",
    "\n",
    "    # Apply a chat template to format the message with the processor\n",
    "    text_prompt = processor.apply_chat_template(\n",
    "        messages, add_generation_prompt=True\n",
    "    )\n",
    "\n",
    "    # Prepare the model inputs (text + images) and convert to tensor\n",
    "    inputs = processor(text=text_prompt, images=images, return_tensors=\"pt\")\n",
    "    return inputs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Loading\n",
    "\n",
    "Models can be loaded by specifying the model parameters using the `OpenVINOMultiModal` method.\n",
    "\n",
    "If you have an Intel GPU, you can specify `device_map=\"gpu\"` to run inference on it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vlm = OpenVINOMultiModal(\n",
    "    model_id_or_path=\"llava-v1.6-mistral-7b-hf/INT4\",\n",
    "    device=\"cpu\",\n",
    "    messages_to_prompt=messages_to_prompt,\n",
    "    generate_kwargs={\"do_sample\": False},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inference with local OpenVINO model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'role': 'user', 'content': [{'type': 'text', 'text': 'Describe the images'}, {'type': 'image'}]}]\n",
      "The image shows a person and a dog on a sandy beach. The person is sitting on the sand, facing the camera, and appears to be smiling. They are wearing a plaid shirt and dark pants. The dog is standing next to the person, looking up at the person's hand, which is extended towards the dog. The dog is wearing a harness and has a collar with a tag. The background features the ocean with waves, and the sky is clear with a warm glow, suggesting either sunrise or sunset. The overall atmosphere of the image is peaceful and joyful, capturing a moment of interaction between the person and the dog. \n"
     ]
    }
   ],
   "source": [
    "response = vlm.complete(\"Describe the images\", image_documents=[image])\n",
    "print(response.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'role': 'user', 'content': [{'type': 'text', 'text': 'Describe the images'}, {'type': 'image'}]}]\n",
      "The image shows a person and a dog on a sandy beach. The person is sitting on the sand, facing the camera, and appears to be smiling. They are wearing a plaid shirt and dark pants. The dog is standing next to the person, looking up at the person's hand, which is extended towards the dog. The dog is wearing a harness and has a collar with a tag. The background features the ocean with waves, and the sky is clear with a warm glow, suggesting either sunrise or sunset. The overall atmosphere of the image is peaceful and joyful, capturing a moment of interaction between the person and the dog. "
     ]
    }
   ],
   "source": [
    "response = vlm.stream_complete(\"Describe the images\", image_documents=[image])\n",
    "for r in response:\n",
    "    print(r.delta, end=\"\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
